#Загрузка библиотек
library(dplyr)
##
## Присоединяю пакет: 'dplyr'
## Следующие объекты скрыты от 'package:stats':
##
## filter, lag
## Следующие объекты скрыты от 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
theme_set(theme_minimal())
library(ggpubr)
## Warning: пакет 'ggpubr' был собран под R версии 4.3.2
library(plotly)
## Warning: пакет 'plotly' был собран под R версии 4.3.2
##
## Присоединяю пакет: 'plotly'
## Следующий объект скрыт от 'package:ggplot2':
##
## last_plot
## Следующий объект скрыт от 'package:stats':
##
## filter
## Следующий объект скрыт от 'package:graphics':
##
## layout
library(rstatix)
## Warning: пакет 'rstatix' был собран под R версии 4.3.2
##
## Присоединяю пакет: 'rstatix'
## Следующий объект скрыт от 'package:stats':
##
## filter
#Задание 1
df = readRDS("life_expectancy_data.RDS")
summary(df)
## Country Year Gender Life expectancy
## Length:195 Min. :2019 Length:195 Min. :55.49
## Class :character 1st Qu.:2019 Class :character 1st Qu.:70.02
## Mode :character Median :2019 Mode :character Median :77.55
## Mean :2019 Mean :75.52
## 3rd Qu.:2019 3rd Qu.:80.95
## Max. :2019 Max. :88.10
## Unemployment Infant Mortality GDP GNI
## Min. : 0.178 Min. : 1.40 Min. :1.884e+08 Min. :3.754e+08
## 1st Qu.: 3.735 1st Qu.: 5.35 1st Qu.:1.117e+10 1st Qu.:1.094e+10
## Median : 5.960 Median :13.50 Median :3.967e+10 Median :4.009e+10
## Mean : 8.597 Mean :19.61 Mean :4.660e+11 Mean :4.864e+11
## 3rd Qu.:10.958 3rd Qu.:30.23 3rd Qu.:2.476e+11 3rd Qu.:2.457e+11
## Max. :36.442 Max. :75.80 Max. :2.143e+13 Max. :2.171e+13
## Clean fuels and cooking technologies Per Capita
## Min. : 0.00 Min. : 228.2
## 1st Qu.: 34.50 1st Qu.: 2165.3
## Median : 80.70 Median : 6624.8
## Mean : 65.98 Mean : 16821.0
## 3rd Qu.:100.00 3rd Qu.: 19439.7
## Max. :100.00 Max. :175813.9
## Mortality caused by road traffic injury Tuberculosis Incidence
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 8.20 1st Qu.: 12.0
## Median :16.00 Median : 46.0
## Mean :17.06 Mean :103.8
## 3rd Qu.:24.00 3rd Qu.:138.5
## Max. :64.60 Max. :654.0
## DPT Immunization HepB3 Immunization Measles Immunization Hospital beds
## Min. :35.00 Min. :35.00 Min. :37.00 Min. : 0.200
## 1st Qu.:85.69 1st Qu.:81.31 1st Qu.:84.85 1st Qu.: 1.301
## Median :92.00 Median :91.00 Median :92.00 Median : 2.570
## Mean :87.99 Mean :86.76 Mean :87.31 Mean : 2.997
## 3rd Qu.:97.00 3rd Qu.:96.00 3rd Qu.:96.50 3rd Qu.: 3.773
## Max. :99.00 Max. :99.00 Max. :99.00 Max. :13.710
## Basic sanitation services Tuberculosis treatment Urban population
## Min. : 8.632 Min. : 0.00 Min. : 13.25
## 1st Qu.: 62.919 1st Qu.: 73.00 1st Qu.: 41.92
## Median : 91.144 Median : 82.00 Median : 58.76
## Mean : 77.380 Mean : 77.57 Mean : 59.12
## 3rd Qu.: 98.582 3rd Qu.: 88.00 3rd Qu.: 78.02
## Max. :100.000 Max. :100.00 Max. :100.00
## Rural population Non-communicable Mortality Sucide Rate continent
## Min. : 0.00 Min. : 4.40 Min. : 0.300 Africa :52
## 1st Qu.:21.98 1st Qu.:11.85 1st Qu.: 2.050 Americas:38
## Median :41.24 Median :17.20 Median : 3.500 Asia :42
## Mean :40.88 Mean :17.05 Mean : 4.802 Europe :48
## 3rd Qu.:58.08 3rd Qu.:22.10 3rd Qu.: 6.600 Oceania :15
## Max. :86.75 Max. :43.70 Max. :30.100
#Задание 2
plot_ly(
data = df[(df$`Life expectancy` != 0) & (df$`Tuberculosis Incidence` != 0),],
x = ~ df$`Tuberculosis Incidence`,
y = ~ df$`Life expectancy`,
color = df$continent) %>%
layout(
title = 'Отношение продолжительности жизни и заболеваемостью туберкулезом',
yaxis = list(title = 'Ожидаемая продолжительность жизни',
zeroline = FALSE), # Уберём выделения нулевых осей по y
xaxis = list(title = 'Заболеваемость туберкулезом',
zeroline = FALSE)) # Уберём выделения нулевых осей по y
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
#Задание 3
newdf = df[df$continent %in% c('Africa', 'Americas'), ] #Новый дата-фрейм с выборкой только из Африки и Америки
ggqqplot(newdf[newdf$`Life expectancy`,],
x = "Life expectancy", facet.by = "continent")
ggplot(data = newdf,
aes(x = `Life expectancy`)) +
geom_histogram()+
facet_grid(newdf$continent)+
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
stat.test <- newdf %>%
wilcox_test(`Life expectancy` ~ continent) %>%
add_xy_position(x = "continent")
stat.test
## # A tibble: 1 × 11
## .y. group1 group2 n1 n2 statistic p y.position groups xmin
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <name> <dbl>
## 1 Life exp… Africa Ameri… 52 38 107 6.34e-13 86.6 <chr> 1
## # ℹ 1 more variable: xmax <dbl>
ggboxplot(
newdf,
x = "continent", y = "Life expectancy",
ylab = "Life expectancy", xlab = "Continent",
add = "jitter"
) +
labs(subtitle = get_test_label(stat.test, detailed = TRUE)) +
stat_pvalue_manual(stat.test, tip.length = 0)
#Задание 4
df_num = subset(df, select = -c(Country, Year, Gender, continent))
summary(df_num)
## Life expectancy Unemployment Infant Mortality GDP
## Min. :55.49 Min. : 0.178 Min. : 1.40 Min. :1.884e+08
## 1st Qu.:70.02 1st Qu.: 3.735 1st Qu.: 5.35 1st Qu.:1.117e+10
## Median :77.55 Median : 5.960 Median :13.50 Median :3.967e+10
## Mean :75.52 Mean : 8.597 Mean :19.61 Mean :4.660e+11
## 3rd Qu.:80.95 3rd Qu.:10.958 3rd Qu.:30.23 3rd Qu.:2.476e+11
## Max. :88.10 Max. :36.442 Max. :75.80 Max. :2.143e+13
## GNI Clean fuels and cooking technologies Per Capita
## Min. :3.754e+08 Min. : 0.00 Min. : 228.2
## 1st Qu.:1.094e+10 1st Qu.: 34.50 1st Qu.: 2165.3
## Median :4.009e+10 Median : 80.70 Median : 6624.8
## Mean :4.864e+11 Mean : 65.98 Mean : 16821.0
## 3rd Qu.:2.457e+11 3rd Qu.:100.00 3rd Qu.: 19439.7
## Max. :2.171e+13 Max. :100.00 Max. :175813.9
## Mortality caused by road traffic injury Tuberculosis Incidence
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 8.20 1st Qu.: 12.0
## Median :16.00 Median : 46.0
## Mean :17.06 Mean :103.8
## 3rd Qu.:24.00 3rd Qu.:138.5
## Max. :64.60 Max. :654.0
## DPT Immunization HepB3 Immunization Measles Immunization Hospital beds
## Min. :35.00 Min. :35.00 Min. :37.00 Min. : 0.200
## 1st Qu.:85.69 1st Qu.:81.31 1st Qu.:84.85 1st Qu.: 1.301
## Median :92.00 Median :91.00 Median :92.00 Median : 2.570
## Mean :87.99 Mean :86.76 Mean :87.31 Mean : 2.997
## 3rd Qu.:97.00 3rd Qu.:96.00 3rd Qu.:96.50 3rd Qu.: 3.773
## Max. :99.00 Max. :99.00 Max. :99.00 Max. :13.710
## Basic sanitation services Tuberculosis treatment Urban population
## Min. : 8.632 Min. : 0.00 Min. : 13.25
## 1st Qu.: 62.919 1st Qu.: 73.00 1st Qu.: 41.92
## Median : 91.144 Median : 82.00 Median : 58.76
## Mean : 77.380 Mean : 77.57 Mean : 59.12
## 3rd Qu.: 98.582 3rd Qu.: 88.00 3rd Qu.: 78.02
## Max. :100.000 Max. :100.00 Max. :100.00
## Rural population Non-communicable Mortality Sucide Rate
## Min. : 0.00 Min. : 4.40 Min. : 0.300
## 1st Qu.:21.98 1st Qu.:11.85 1st Qu.: 2.050
## Median :41.24 Median :17.20 Median : 3.500
## Mean :40.88 Mean :17.05 Mean : 4.802
## 3rd Qu.:58.08 3rd Qu.:22.10 3rd Qu.: 6.600
## Max. :86.75 Max. :43.70 Max. :30.100
library(corrplot)
## Warning: пакет 'corrplot' был собран под R версии 4.3.2
## corrplot 0.92 loaded
df_cor <- cor(df_num)
df_cor
## Life expectancy Unemployment
## Life expectancy 1.0000000 -0.122453828
## Unemployment -0.1224538 1.000000000
## Infant Mortality -0.8309072 0.103775270
## GDP 0.1688143 -0.111437568
## GNI 0.1786383 -0.109486971
## Clean fuels and cooking technologies 0.7637583 0.063975082
## Per Capita 0.6034817 -0.183778304
## Mortality caused by road traffic injury -0.6518097 0.173386776
## Tuberculosis Incidence -0.5831370 0.121480015
## DPT Immunization 0.5044753 -0.147098313
## HepB3 Immunization 0.4023880 -0.102304627
## Measles Immunization 0.5323483 -0.166019661
## Hospital beds 0.4849944 -0.147215966
## Basic sanitation services 0.8515922 0.029209789
## Tuberculosis treatment -0.3214166 -0.051270213
## Urban population 0.5829745 0.085895193
## Rural population -0.5829745 -0.085895193
## Non-communicable Mortality -0.6338148 0.131242386
## Sucide Rate 0.1593575 0.006558983
## Infant Mortality GDP
## Life expectancy -0.83090718 0.16881430
## Unemployment 0.10377527 -0.11143757
## Infant Mortality 1.00000000 -0.16907590
## GDP -0.16907590 1.00000000
## GNI -0.16096299 0.99435959
## Clean fuels and cooking technologies -0.77567408 0.13751753
## Per Capita -0.31858469 0.17389277
## Mortality caused by road traffic injury 0.65050858 -0.11544108
## Tuberculosis Incidence 0.56359507 -0.08695088
## DPT Immunization -0.59029923 0.10813790
## HepB3 Immunization -0.52710787 0.08719372
## Measles Immunization -0.58504641 0.10139187
## Hospital beds -0.52085961 0.13217297
## Basic sanitation services -0.77056506 0.14600318
## Tuberculosis treatment 0.27989549 -0.02495462
## Urban population -0.47460671 0.15210771
## Rural population 0.47460671 -0.15210771
## Non-communicable Mortality 0.66661171 -0.19176139
## Sucide Rate 0.05980403 0.11409037
## GNI
## Life expectancy 0.17863826
## Unemployment -0.10948697
## Infant Mortality -0.16096299
## GDP 0.99435959
## GNI 1.00000000
## Clean fuels and cooking technologies 0.13470096
## Per Capita 0.18266349
## Mortality caused by road traffic injury -0.11353833
## Tuberculosis Incidence -0.08693997
## DPT Immunization 0.10537313
## HepB3 Immunization 0.08210454
## Measles Immunization 0.09835173
## Hospital beds 0.13351160
## Basic sanitation services 0.15141620
## Tuberculosis treatment -0.02115128
## Urban population 0.16783616
## Rural population -0.16783616
## Non-communicable Mortality -0.18266265
## Sucide Rate 0.13051116
## Clean fuels and cooking technologies
## Life expectancy 0.76375825
## Unemployment 0.06397508
## Infant Mortality -0.77567408
## GDP 0.13751753
## GNI 0.13470096
## Clean fuels and cooking technologies 1.00000000
## Per Capita 0.38753491
## Mortality caused by road traffic injury -0.59553317
## Tuberculosis Incidence -0.54553537
## DPT Immunization 0.44831815
## HepB3 Immunization 0.38243439
## Measles Immunization 0.49991332
## Hospital beds 0.43564106
## Basic sanitation services 0.84316638
## Tuberculosis treatment -0.31357924
## Urban population 0.59435666
## Rural population -0.59435666
## Non-communicable Mortality -0.65526759
## Sucide Rate 0.00976739
## Per Capita
## Life expectancy 0.60348166
## Unemployment -0.18377830
## Infant Mortality -0.31858469
## GDP 0.17389277
## GNI 0.18266349
## Clean fuels and cooking technologies 0.38753491
## Per Capita 1.00000000
## Mortality caused by road traffic injury -0.41639961
## Tuberculosis Incidence -0.30760893
## DPT Immunization 0.21421912
## HepB3 Immunization 0.09303434
## Measles Immunization 0.21571852
## Hospital beds 0.24966424
## Basic sanitation services 0.45409579
## Tuberculosis treatment -0.32644726
## Urban population 0.42880232
## Rural population -0.42880232
## Non-communicable Mortality -0.35531918
## Sucide Rate 0.32281997
## Mortality caused by road traffic injury
## Life expectancy -0.6518097
## Unemployment 0.1733868
## Infant Mortality 0.6505086
## GDP -0.1154411
## GNI -0.1135383
## Clean fuels and cooking technologies -0.5955332
## Per Capita -0.4163996
## Mortality caused by road traffic injury 1.0000000
## Tuberculosis Incidence 0.4123296
## DPT Immunization -0.3400658
## HepB3 Immunization -0.2628041
## Measles Immunization -0.3107449
## Hospital beds -0.4909601
## Basic sanitation services -0.6320693
## Tuberculosis treatment 0.3130249
## Urban population -0.3718674
## Rural population 0.3718674
## Non-communicable Mortality 0.4071462
## Sucide Rate -0.1102582
## Tuberculosis Incidence DPT Immunization
## Life expectancy -0.58313705 0.50447529
## Unemployment 0.12148001 -0.14709831
## Infant Mortality 0.56359507 -0.59029923
## GDP -0.08695088 0.10813790
## GNI -0.08693997 0.10537313
## Clean fuels and cooking technologies -0.54553537 0.44831815
## Per Capita -0.30760893 0.21421912
## Mortality caused by road traffic injury 0.41232959 -0.34006575
## Tuberculosis Incidence 1.00000000 -0.37169763
## DPT Immunization -0.37169763 1.00000000
## HepB3 Immunization -0.31215616 0.94776877
## Measles Immunization -0.37364785 0.88078924
## Hospital beds -0.19543396 0.32366629
## Basic sanitation services -0.55532307 0.45942955
## Tuberculosis treatment 0.23672979 -0.13993470
## Urban population -0.33622933 0.22057595
## Rural population 0.33622933 -0.22057595
## Non-communicable Mortality 0.48089925 -0.38159200
## Sucide Rate 0.09858654 0.05567581
## HepB3 Immunization Measles Immunization
## Life expectancy 0.40238797 0.53234834
## Unemployment -0.10230463 -0.16601966
## Infant Mortality -0.52710787 -0.58504641
## GDP 0.08719372 0.10139187
## GNI 0.08210454 0.09835173
## Clean fuels and cooking technologies 0.38243439 0.49991332
## Per Capita 0.09303434 0.21571852
## Mortality caused by road traffic injury -0.26280410 -0.31074490
## Tuberculosis Incidence -0.31215616 -0.37364785
## DPT Immunization 0.94776877 0.88078924
## HepB3 Immunization 1.00000000 0.86161432
## Measles Immunization 0.86161432 1.00000000
## Hospital beds 0.27225503 0.33526203
## Basic sanitation services 0.38112985 0.50904494
## Tuberculosis treatment -0.09250053 -0.14092951
## Urban population 0.13692089 0.24604275
## Rural population -0.13692089 -0.24604275
## Non-communicable Mortality -0.31401541 -0.38626279
## Sucide Rate -0.01978305 0.02560727
## Hospital beds Basic sanitation services
## Life expectancy 0.4849944 0.85159219
## Unemployment -0.1472160 0.02920979
## Infant Mortality -0.5208596 -0.77056506
## GDP 0.1321730 0.14600318
## GNI 0.1335116 0.15141620
## Clean fuels and cooking technologies 0.4356411 0.84316638
## Per Capita 0.2496642 0.45409579
## Mortality caused by road traffic injury -0.4909601 -0.63206935
## Tuberculosis Incidence -0.1954340 -0.55532307
## DPT Immunization 0.3236663 0.45942955
## HepB3 Immunization 0.2722550 0.38112985
## Measles Immunization 0.3352620 0.50904494
## Hospital beds 1.0000000 0.47445249
## Basic sanitation services 0.4744525 1.00000000
## Tuberculosis treatment -0.1947393 -0.30065649
## Urban population 0.2740715 0.55069603
## Rural population -0.2740715 -0.55069603
## Non-communicable Mortality -0.3562093 -0.52254411
## Sucide Rate 0.2665261 0.15953741
## Tuberculosis treatment Urban population
## Life expectancy -0.32141658 0.58297452
## Unemployment -0.05127021 0.08589519
## Infant Mortality 0.27989549 -0.47460671
## GDP -0.02495462 0.15210771
## GNI -0.02115128 0.16783616
## Clean fuels and cooking technologies -0.31357924 0.59435666
## Per Capita -0.32644726 0.42880232
## Mortality caused by road traffic injury 0.31302487 -0.37186744
## Tuberculosis Incidence 0.23672979 -0.33622933
## DPT Immunization -0.13993470 0.22057595
## HepB3 Immunization -0.09250053 0.13692089
## Measles Immunization -0.14092951 0.24604275
## Hospital beds -0.19473929 0.27407149
## Basic sanitation services -0.30065649 0.55069603
## Tuberculosis treatment 1.00000000 -0.28393086
## Urban population -0.28393086 1.00000000
## Rural population 0.28393086 -1.00000000
## Non-communicable Mortality 0.26680379 -0.53028884
## Sucide Rate -0.07289482 0.08936862
## Rural population
## Life expectancy -0.58297452
## Unemployment -0.08589519
## Infant Mortality 0.47460671
## GDP -0.15210771
## GNI -0.16783616
## Clean fuels and cooking technologies -0.59435666
## Per Capita -0.42880232
## Mortality caused by road traffic injury 0.37186744
## Tuberculosis Incidence 0.33622933
## DPT Immunization -0.22057595
## HepB3 Immunization -0.13692089
## Measles Immunization -0.24604275
## Hospital beds -0.27407149
## Basic sanitation services -0.55069603
## Tuberculosis treatment 0.28393086
## Urban population -1.00000000
## Rural population 1.00000000
## Non-communicable Mortality 0.53028884
## Sucide Rate -0.08936862
## Non-communicable Mortality Sucide Rate
## Life expectancy -0.6338148 0.159357534
## Unemployment 0.1312424 0.006558983
## Infant Mortality 0.6666117 0.059804035
## GDP -0.1917614 0.114090369
## GNI -0.1826627 0.130511162
## Clean fuels and cooking technologies -0.6552676 0.009767390
## Per Capita -0.3553192 0.322819969
## Mortality caused by road traffic injury 0.4071462 -0.110258162
## Tuberculosis Incidence 0.4808992 0.098586543
## DPT Immunization -0.3815920 0.055675815
## HepB3 Immunization -0.3140154 -0.019783046
## Measles Immunization -0.3862628 0.025607272
## Hospital beds -0.3562093 0.266526138
## Basic sanitation services -0.5225441 0.159537407
## Tuberculosis treatment 0.2668038 -0.072894819
## Urban population -0.5302888 0.089368619
## Rural population 0.5302888 -0.089368619
## Non-communicable Mortality 1.0000000 0.184023972
## Sucide Rate 0.1840240 1.000000000
Визуализация корреляции №1
corrplot(df_cor, method = 'number', addCoef.col = 0.1,
number.cex = 0.5,
tl.cex = 0.4)
Визуализация корреляции №2
testRes = cor.mtest(df_num, conf.level = 0.95)
corrplot(df_cor, p.mat = testRes$p, sig.level = 0.10, order = 'hclust', addrect = 2, number.cex = 0.5, tl.cex = 0.4)
corrplot(df_cor, p.mat = testRes$p, method = 'color', diag = FALSE, type = 'upper',
sig.level = c(0.001, 0.01, 0.05), pch.cex = 0.9,
insig = 'label_sig', pch.col = 'grey20', order = 'AOE', number.cex=0.5, tl.cex = 0.4)
#Задание 5
df_clear_scaled <- scale(df_num)
head(df_clear_scaled)
## Life expectancy Unemployment Infant Mortality GDP GNI
## [1,] -1.1887512 0.78408167 1.37520928 -0.23131589 -0.23883882
## [2,] 0.6085937 0.39076754 -0.70358937 -0.23307415 -0.24083177
## [3,] 0.3395059 1.43864925 -0.05987047 -0.15219202 -0.16296179
## [4,] -1.4944027 -0.10922743 1.46970013 -0.19478831 -0.20674291
## [5,] 0.3357325 -0.04882006 -0.85713700 -0.24016716 -0.24779098
## [6,] 0.5817890 0.30129329 -0.74492912 -0.00727451 -0.02677993
## Clean fuels and cooking technologies Per Capita
## [1,] -0.8254549 -0.6680197
## [2,] 0.4052050 -0.4674731
## [3,] 0.9172919 -0.5250000
## [4,] -0.4510260 -0.5732820
## [5,] 0.9365639 0.0227355
## [6,] 0.9310577 -0.2767667
## Mortality caused by road traffic injury Tuberculosis Incidence
## [1,] -0.1116583 0.6348233
## [2,] -0.5166385 -0.6534901
## [3,] 0.3704610 -0.3183797
## [4,] 0.8718650 1.8412208
## [5,] -1.6447977 -0.7726405
## [6,] -0.2852213 -0.5566804
## DPT Immunization HepB3 Immunization Measles Immunization Hospital beds
## [1,] -1.7729743 -1.63130248 -1.7674223 -1.08432585
## [2,] 0.8879797 0.96139344 0.5828081 0.02352559
## [3,] 0.2428999 0.33286109 -0.5544002 -0.50598801
## [4,] -2.4986890 -2.65266754 -2.7530027 -0.92881827
## [5,] 0.5654398 0.96139344 0.4311803 -0.17591613
## [6,] -0.1602749 -0.05997162 0.5069942 0.68216505
## Basic sanitation services Tuberculosis treatment Urban population
## [1,] -1.0217399 0.7878907 -1.4325629
## [2,] 0.7851429 0.6118986 0.0903225
## [3,] 0.3154049 0.4945706 0.6037463
## [4,] -0.9357671 -0.5027177 0.3027323
## [5,] 0.2932369 -0.3110819 -1.4861375
## [6,] 0.5051511 -1.7933262 1.4108863
## Rural population Non-communicable Mortality Sucide Rate
## [1,] 1.4325629 2.70651902 -0.3177247
## [2,] -0.0903225 -1.56166057 -0.5555414
## [3,] -0.6037463 -0.60061351 -0.7933581
## [4,] -0.3027323 0.33216746 -0.6612377
## [5,] 1.4861375 0.07777265 -1.0575990
## [6,] -1.4108863 -0.69954482 -0.3969969
df_clear_dist <- dist(df_clear_scaled,
method = "euclidean"
)
df_clear_hc <- hclust(d = df_clear_dist,
method = "ward.D2")
#Задание 6
library(factoextra)
## Warning: пакет 'factoextra' был собран под R версии 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_dend(df_clear_hc,
cex = 0.1) # cex() - размер лейблов
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(pheatmap)
## Warning: пакет 'pheatmap' был собран под R версии 4.3.2
pheatmap(df_clear_scaled,
show_rownames = FALSE,
clustering_distance_rows = df_clear_dist,
clustering_method = "ward.D2",
cutree_rows = 5,
cutree_cols = length(colnames(df_clear_scaled)),
angle_col = 45,
main = "Dendrograms for clustering rows and columns with heatmap")
Пояснение к заданию 6: по горизонтали расположены все переменные
дата-фрейма, по вертикали каждая из 195 стран, высота слияния древа
указывает на различие между двумя объектами/кластерами: чем выше высота
слияния, тем менее похожи объекты. Мы проделали сначала нормализацию
всех величин дата-фрейма, затем посчитали расстояние между каждыми
значениями и создали матрицу с расстоянием. Далее Ward построили
иерархическое древо по методу Ward. При построении хитмапа мы указали 5
рядов кластеризации. Визуально в первом сверху кластере можно увидеть
страны с низкими показателями иммунизации от кори, гепатита и АКДС. В
третьем кластере сверху выделяются страны с высокими значениями ВВП и
ВНД. Также в первом ряду выбивается кластер по колонке младенческой
смертности.
#Задание 7
library(FactoMineR)
## Warning: пакет 'FactoMineR' был собран под R версии 4.3.2
library(factoextra)
df_full.pca <- prcomp(df_num,
scale = T)
summary(df_full.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.7526 1.4841 1.3952 1.17177 1.08375 0.96347 0.9288
## Proportion of Variance 0.3988 0.1159 0.1025 0.07227 0.06182 0.04886 0.0454
## Cumulative Proportion 0.3988 0.5147 0.6172 0.68945 0.75126 0.80012 0.8455
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.85740 0.69263 0.68937 0.59106 0.54986 0.47085 0.36596
## Proportion of Variance 0.03869 0.02525 0.02501 0.01839 0.01591 0.01167 0.00705
## Cumulative Proportion 0.88421 0.90946 0.93447 0.95286 0.96877 0.98044 0.98749
## PC15 PC16 PC17 PC18 PC19
## Standard deviation 0.34546 0.26941 0.20224 0.06968 1.017e-15
## Proportion of Variance 0.00628 0.00382 0.00215 0.00026 0.000e+00
## Cumulative Proportion 0.99377 0.99759 0.99974 1.00000 1.000e+00
fviz_eig(df_full.pca, addlabels = T, ylim = c(0, 40))
fviz_pca_var(df_full.pca, col.var = "contrib")
fviz_pca_var(df_full.pca,
select.var = list(contrib = 3), # Задаём число здесь
col.var = "contrib")
fviz_contrib(df_full.pca, choice = "var", axes = 1, top = 24) # 1
fviz_contrib(df_full.pca, choice = "var", axes = 2, top = 24) # 2
fviz_contrib(df_full.pca, choice = "var", axes = 3, top = 24) # 3
Пояснение к заданию 7: первая компонента описывает почти 40% дисперсии.
Первые две компоненты объясняют 50% дисперсии. На втором графике явно
выделяются только четыре группы переменных: 1 группа - переменные,
относящиеся к иммунизации (Measles, DPT, HepB3 Immunization), 2 группа -
Hospital Beds, Basic Sanitation Services, Life expectancy, Clean fuels
and cooking technologies, 3 группа - Urban population, Per Capita, GDP,
GNI, Suicide Rate, 4 группа - другие (rural population, tuberculosis
treatment, Inf Mort, Tuberculosis incidence, road injury mortality,
Non-communicable mortality). Основная проблема: много переменных,
которые не коррелируют друг с другом.
#Задание 8
library(ggbiplot)
## Загрузка требуемого пакета: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Присоединяю пакет: 'plyr'
## Следующие объекты скрыты от 'package:rstatix':
##
## desc, mutate
## Следующие объекты скрыты от 'package:plotly':
##
## arrange, mutate, rename, summarise
## Следующий объект скрыт от 'package:ggpubr':
##
## mutate
## Следующие объекты скрыты от 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Загрузка требуемого пакета: scales
## Загрузка требуемого пакета: grid
ggbiplot(df_full.pca,
scale=0, alpha = 0.1) +
theme_minimal()
df_num_with_ch = subset(df, select = -c(Country, Year, Gender))
summary(df_num_with_ch)
## Life expectancy Unemployment Infant Mortality GDP
## Min. :55.49 Min. : 0.178 Min. : 1.40 Min. :1.884e+08
## 1st Qu.:70.02 1st Qu.: 3.735 1st Qu.: 5.35 1st Qu.:1.117e+10
## Median :77.55 Median : 5.960 Median :13.50 Median :3.967e+10
## Mean :75.52 Mean : 8.597 Mean :19.61 Mean :4.660e+11
## 3rd Qu.:80.95 3rd Qu.:10.958 3rd Qu.:30.23 3rd Qu.:2.476e+11
## Max. :88.10 Max. :36.442 Max. :75.80 Max. :2.143e+13
## GNI Clean fuels and cooking technologies Per Capita
## Min. :3.754e+08 Min. : 0.00 Min. : 228.2
## 1st Qu.:1.094e+10 1st Qu.: 34.50 1st Qu.: 2165.3
## Median :4.009e+10 Median : 80.70 Median : 6624.8
## Mean :4.864e+11 Mean : 65.98 Mean : 16821.0
## 3rd Qu.:2.457e+11 3rd Qu.:100.00 3rd Qu.: 19439.7
## Max. :2.171e+13 Max. :100.00 Max. :175813.9
## Mortality caused by road traffic injury Tuberculosis Incidence
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 8.20 1st Qu.: 12.0
## Median :16.00 Median : 46.0
## Mean :17.06 Mean :103.8
## 3rd Qu.:24.00 3rd Qu.:138.5
## Max. :64.60 Max. :654.0
## DPT Immunization HepB3 Immunization Measles Immunization Hospital beds
## Min. :35.00 Min. :35.00 Min. :37.00 Min. : 0.200
## 1st Qu.:85.69 1st Qu.:81.31 1st Qu.:84.85 1st Qu.: 1.301
## Median :92.00 Median :91.00 Median :92.00 Median : 2.570
## Mean :87.99 Mean :86.76 Mean :87.31 Mean : 2.997
## 3rd Qu.:97.00 3rd Qu.:96.00 3rd Qu.:96.50 3rd Qu.: 3.773
## Max. :99.00 Max. :99.00 Max. :99.00 Max. :13.710
## Basic sanitation services Tuberculosis treatment Urban population
## Min. : 8.632 Min. : 0.00 Min. : 13.25
## 1st Qu.: 62.919 1st Qu.: 73.00 1st Qu.: 41.92
## Median : 91.144 Median : 82.00 Median : 58.76
## Mean : 77.380 Mean : 77.57 Mean : 59.12
## 3rd Qu.: 98.582 3rd Qu.: 88.00 3rd Qu.: 78.02
## Max. :100.000 Max. :100.00 Max. :100.00
## Rural population Non-communicable Mortality Sucide Rate continent
## Min. : 0.00 Min. : 4.40 Min. : 0.300 Africa :52
## 1st Qu.:21.98 1st Qu.:11.85 1st Qu.: 2.050 Americas:38
## Median :41.24 Median :17.20 Median : 3.500 Asia :42
## Mean :40.88 Mean :17.05 Mean : 4.802 Europe :48
## 3rd Qu.:58.08 3rd Qu.:22.10 3rd Qu.: 6.600 Oceania :15
## Max. :86.75 Max. :43.70 Max. :30.100
plot = ggbiplot(df_full.pca,
scale=0,
groups = as.factor(df_num_with_ch$continent),
ellipse = T,
alpha = 0.2) +
theme_minimal()
ggplotly(plot)
#Задание 9 Интерпретация РСА анализа: большое количество переменных затрудняет интерпретацию. Видно, что в Европе наибольший вес оказывают переменные: Hospital Beds, Basic Sanitation Services, Life expectancy, Clean fuels and cooking technologies. Америка и Азия перекрывают друг друга больше остальных контитнентов. В Африке наибольший вес оказывают переменные: rural population, tuberculosis treatment, Inf Mort, Tuberculosis incidence, road injury mortality, Non-communicable mortality. Можно сделать вывод, что в Африке больше сельского населения, выше младенческая смертность, заболеваемость туберкулезом, смертность в ДТП и т.д. В Европе наоборот выше продолжительность жизни, уровень санитарии и т.д.
#Задание 10
#Задание 11
#PCA1
df_pca1 = subset(df, select = -c(Country, Year, Gender, continent, GDP, GNI, `Life expectancy`,`HepB3 Immunization`, `Measles Immunization`))
pca1 <- prcomp(df_pca1,
scale = T)
summary(pca1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4484 1.2142 1.1844 1.00213 0.94281 0.85779 0.80194
## Proportion of Variance 0.4282 0.1053 0.1002 0.07173 0.06349 0.05256 0.04594
## Cumulative Proportion 0.4282 0.5335 0.6337 0.70542 0.76891 0.82147 0.86741
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.68575 0.68504 0.58175 0.53716 0.41185 0.34666
## Proportion of Variance 0.03359 0.03352 0.02417 0.02061 0.01212 0.00858
## Cumulative Proportion 0.90100 0.93452 0.95869 0.97930 0.99142 1.00000
## PC14
## Standard deviation 5.007e-16
## Proportion of Variance 0.000e+00
## Cumulative Proportion 1.000e+00
fviz_eig(pca1, addlabels = T, ylim = c(0, 40))
fviz_pca_var(pca1, col.var = "contrib")
fviz_pca_var(pca1,
select.var = list(contrib = 3), # Задаём число здесь
col.var = "contrib")
fviz_contrib(pca1, choice = "var", axes = 1, top = 24) # 1
fviz_contrib(pca1, choice = "var", axes = 2, top = 24) # 2
fviz_contrib(pca1, choice = "var", axes = 3, top = 24) # 3
ggbiplot(pca1,
scale=0, alpha = 0.1) +
theme_minimal()
#PCA2
df_pca2 = subset(df_pca1, select = -c(`DPT Immunization`, `Unemployment`, `Infant Mortality`, `Per Capita`, `Hospital beds`))
pca2 <- prcomp(df_pca2,
scale = T)
summary(pca2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1144 1.0877 1.0216 0.90733 0.73692 0.66282 0.60591
## Proportion of Variance 0.4967 0.1315 0.1159 0.09147 0.06034 0.04881 0.04079
## Cumulative Proportion 0.4967 0.6282 0.7441 0.83562 0.89595 0.94477 0.98556
## PC8 PC9
## Standard deviation 0.36049 8.036e-16
## Proportion of Variance 0.01444 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
fviz_eig(pca2, addlabels = T, ylim = c(0, 40))
fviz_pca_var(pca2, col.var = "contrib")
fviz_pca_var(pca2,
select.var = list(contrib = 3), # Задаём число здесь
col.var = "contrib")
fviz_contrib(pca1, choice = "var", axes = 1, top = 24) # 1
fviz_contrib(pca1, choice = "var", axes = 2, top = 24) # 2
fviz_contrib(pca1, choice = "var", axes = 3, top = 24) # 3
ggbiplot(pca2,
scale=0, alpha = 0.1) +
theme_minimal()
#PCA3
df_pca3 = subset(df_pca2, select = -c(`Sucide Rate`, `Rural population`, `Urban population`, `Tuberculosis Incidence`, `Tuberculosis treatment`))
pca3 <- prcomp(df_pca3,
scale = T)
summary(pca3)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.688 0.7817 0.6361 0.36929
## Proportion of Variance 0.712 0.1528 0.1011 0.03409
## Cumulative Proportion 0.712 0.8648 0.9659 1.00000
fviz_eig(pca3, addlabels = T, ylim = c(0, 40))
fviz_pca_var(pca3, col.var = "contrib")
fviz_pca_var(pca3,
select.var = list(contrib = 3), # Задаём число здесь
col.var = "contrib")
fviz_contrib(pca1, choice = "var", axes = 1, top = 24) # 1
fviz_contrib(pca1, choice = "var", axes = 2, top = 24) # 2
fviz_contrib(pca1, choice = "var", axes = 3, top = 24) # 3
ggbiplot(pca3,
scale=0, alpha = 0.1) +
theme_minimal()
Пояснение к заданию 11: кумулятивный процент объясненной вариации
повышается при каждом повторе эксперимента. К третьему повтору первая
главная компонента объясняет 71 процент вариации.